# Notebook setup: imports, global plot styling, and IPython/Jupyter configuration.
# NOTE: the lines starting with `%` are IPython magics, so this file only runs
# inside IPython/Jupyter, not as a plain Python script.
import pandas as pd
import numpy as np
import os
import datetime
import matplotlib
import matplotlib.pyplot as plt
from mpl_toolkits.axes_grid1 import make_axes_locatable
import seaborn as sns
from sklearn import tree
from sklearn import ensemble
import itertools
from sklearn import metrics
from sklearn import model_selection
import pvlib
# Presumably project-local modules (not on PyPI under these names) - confirm.
import cs_detection
import utils
import visualize_plotly as visualize
# Global matplotlib/seaborn look: "white" seaborn style and a wide default figure.
sns.set_style("white")
matplotlib.rcParams['figure.figsize'] = (20., 8.)
from IPython.display import Image
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
import plotly.graph_objs as go
import cufflinks as cf
# Switch cufflinks and plotly to offline rendering inside the notebook.
cf.go_offline()
init_notebook_mode(connected=True)
# Reload imported modules automatically before executing code, so edits to the
# imported modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2
np.set_printoptions(precision=4)
# Render matplotlib figures inline, at retina resolution.
%matplotlib inline
%config InlineBackend.figure_format = 'retina'
matplotlib.rcParams.update({'font.size': 16})
import warnings
# Warning suppression is intentionally left disabled:
# warnings.filterwarnings(action='ignore')
import xgboost as xgb
# Close any matplotlib figures that are still open.
plt.close('all')
import pygal
The previous two sections show promising results when going from the default to the cleaned data: we see marked improvements in both the scores and the confusion matrix. The features used in both cases are highly correlated, though. Here we look at feature importance in a different way: by removing one feature at a time and recalculating the F1 score.
# Train a random forest on NSRDB data before 2015 and score it on 2015 onwards.
# features_ is left at the ClearskyDetection default for both objects.
nsrdb_columns = ('GHI', 'Clearsky GHI pvlib', 'sky_status')
detect_obj = cs_detection.ClearskyDetection.read_pickle('abq_nsrdb_1.pkl.gz', *nsrdb_columns)
detect_obj.df.index = detect_obj.df.index.tz_convert('MST')

# Training split: everything up to 01-01-2015.
train_obj = cs_detection.ClearskyDetection(detect_obj.df, *nsrdb_columns)
train_obj.trim_dates(None, '01-01-2015')

# Test split: everything from 01-01-2015 onwards.
test_obj = cs_detection.ClearskyDetection(detect_obj.df, *nsrdb_columns)
test_obj.trim_dates('01-01-2015', None)

# The same label-filter thresholds are used for fitting and for scoring.
label_filter = {'ratio_mean_val': 0.95, 'diff_mean_val': 50}

clf = ensemble.RandomForestClassifier(n_estimators=32, n_jobs=-1, random_state=42)
clf = train_obj.fit_model(clf, **label_filter)

# Predict first, then filter labels; the score below only uses rows where
# 'quality_mask' is set.
pred = test_obj.predict(clf)
test_obj.filter_labels(**label_filter)

quality_rows = test_obj.df['quality_mask']
true_labels = test_obj.df.loc[quality_rows, 'sky_status']
f1_score = metrics.f1_score(true_labels, pred[quality_rows])
print(' F1: {}'.format(f1_score))
# Correlation matrix of the model's input features on the test split.
vis = visualize.Visualizer()
feature_names = test_obj.features_
feature_corr = test_obj.df[feature_names].corr().values
vis.plot_corr_matrix(feature_corr, feature_names)

# Confusion matrix restricted to rows flagged by 'quality_mask'.
# Presumably the first class (False / 0) is "cloudy" and the second is "clear" - confirm.
quality_rows = test_obj.df['quality_mask']
cm = metrics.confusion_matrix(
    test_obj.df.loc[quality_rows, 'sky_status'],
    pred[quality_rows],
)
visualize.plot_confusion_matrix2(cm, ('cloudy', 'clear'))
# Bar chart of the fitted classifier's feature_importances_, one bar per feature,
# labelled with the feature names used by the test object.
importances = clf.feature_importances_
bar_positions = range(len(importances))

fig, ax = plt.subplots(figsize=(12, 8))
_ = ax.bar(bar_positions, importances)
_ = ax.set_xticks(bar_positions)
_ = ax.set_xticklabels(test_obj.features_, rotation=45)
_ = ax.set_xlabel('Feature')
_ = ax.set_ylabel('Importance')
_ = fig.tight_layout()
# Interactive comparison of the NSRDB labels and the random-forest predictions
# on the test split: GHI and clear-sky GHI as lines, with markers on the points
# labelled clear by NSRDB only, by the classifier only, or by both.
nsrdb_mask = test_obj.df['sky_status'].values
test_df = test_obj.df

nsrdb_only_rows = test_df[nsrdb_mask & ~pred]
rf_only_rows = test_df[pred & ~nsrdb_mask]
agreed_rows = test_df[nsrdb_mask & pred]

ghi_trace = go.Scatter(x=test_df.index, y=test_df['GHI'], name='GHI')
clearsky_trace = go.Scatter(x=test_df.index, y=test_df['Clearsky GHI pvlib'], name='GHIcs')
nsrdb_only_trace = go.Scatter(
    x=nsrdb_only_rows.index, y=nsrdb_only_rows['GHI'],
    name='NSRDB only', mode='markers', marker={'size': 10},
)
rf_only_trace = go.Scatter(
    x=rf_only_rows.index, y=rf_only_rows['GHI'],
    name='RF only', mode='markers', marker={'size': 10},
)
agreed_trace = go.Scatter(
    x=agreed_rows.index, y=agreed_rows['GHI'],
    name='Both', mode='markers', marker={'size': 10},
)
iplot([ghi_trace, clearsky_trace, nsrdb_only_trace, rf_only_trace, agreed_trace])
# Apply the NSRDB-trained classifier to the ground measurements (no labels are
# loaded here) for the window '10-01-2015' to '11-01-2015', at the data's
# native sampling. features_ is left at its default.
ground_columns = ('GHI', 'Clearsky GHI pvlib')
detect_obj = cs_detection.ClearskyDetection.read_pickle('abq_ground_1.pkl.gz', *ground_columns)
detect_obj.df.index = detect_obj.df.index.tz_convert('MST')
detect_obj.trim_dates('10-01-2015', '11-01-2015')
pred = detect_obj.predict(clf)

# Read the frame after predict() so the plot reflects the object's current data.
ground_df = detect_obj.df
clear_rows = ground_df[pred]
iplot([
    go.Scatter(x=ground_df.index, y=ground_df['GHI'], name='GHI'),
    go.Scatter(x=ground_df.index, y=ground_df['Clearsky GHI pvlib'], name='GHIcs'),
    go.Scatter(x=clear_rows.index, y=clear_rows['GHI'], name='Clear',
               mode='markers', marker={'size': 10}),
])
# Repeat the ground-data prediction after downsampling the measurements, once
# per value below (presumably the new sampling interval in minutes - confirm
# against ClearskyDetection.downsample).
#
# features_ is deliberately left at its default here: the classifier was fit
# with the default feature set, and `features` is not defined anywhere in this
# file, so assigning `detect_obj.features_ = features` would raise NameError.
for downsample_arg in (5, 10, 15, 30):
    # Reload from disk each time so every run starts from the original data.
    detect_obj = cs_detection.ClearskyDetection.read_pickle('abq_ground_1.pkl.gz', 'GHI', 'Clearsky GHI pvlib')
    detect_obj.df.index = detect_obj.df.index.tz_convert('MST')
    detect_obj.trim_dates('10-01-2015', '11-01-2015')
    detect_obj.downsample(downsample_arg)
    pred = detect_obj.predict(clf)

    # GHI and clear-sky GHI as lines; points predicted clear as markers.
    trace1 = go.Scatter(x=detect_obj.df.index, y=detect_obj.df['GHI'], name='GHI')
    trace2 = go.Scatter(x=detect_obj.df.index, y=detect_obj.df['Clearsky GHI pvlib'], name='GHIcs')
    trace3 = go.Scatter(x=detect_obj.df[pred].index, y=detect_obj.df[pred]['GHI'], name='Clear', mode='markers', marker={'size': 10})
    iplot([trace1, trace2, trace3])
# Last timestamp in the test split's index. As a bare expression, it is only
# displayed when it is the final line of a notebook cell.
test_obj.df.index[-1]
The previous two sections show promising results when going from the default to the cleaned data: we see marked improvements in both the scores and the confusion matrix. The features used in both cases are highly correlated, though. Here we look at feature importance in a different way: by removing one feature at a time and recalculating the F1 score.